***********************************************
* Title: rwanda_jde_table3.do
* Author: Todd Pugatch
* Last update: June 10 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	rwanda_school_trainattend_jde.dta 
*			headteacher_merge_jde.dta
*			teacher_merge_jde.dta
* Outputs: 	rwanda_jde_table3.[txt/xls/out]
* Notes: produces Table 3
************************************************

* Set environment
local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"

* begin log file
log using "$temp/rwanda_jde_table3.txt", text replace

*************************************************************************
* FIRST PASS THROUGH: SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
**************************************************************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* PROGRAM TAKE-UP: teacher training & exchange visit attendance
/*Goal: produce columns 1-2 of Table P2
	Uses Educate! administrative data on program take-up, collapsed to school level.
	Regress training/exchange visit attendance on treatment status, controlling
		for randomization strata (baseline outcome not applicable.
	Note that according to Section 3.5 of JDE Registered Report Stage 1, we will
		do 2SLS if these take-up measures fall below 85%.
	Note also that because take-up is defined at school level, no need to calculate
		Lee bounds for attrition.*/
qui use "$cleandata/rwanda_school_trainattend_jde.dta", clear

local Y "training_any_pct exchange_pct"

/*simple means and differences (unadjusted by strata), to measure compliance*/
orth_out `Y' using "$results/rwanda_jde_table3.xls", by(treatment) se compare count colnum title("program take-up, unadjusted") replace
	
/*regressions*/
local i=1 /*initialize loop*/
foreach y in `Y' {
	qui areg `y' treatment, a(strata) robust
	scalar define H1_c`i'b=_b[treatment]
	scalar define H1_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*Was non-compliance driven by discrepancies in treatment assignment between researchers
	and Educate!? (****EXPLORATORY****)*/
orth_out `Y' using "$results/rwanda_jde_table3.xls", by(treatment_educateassgnt) se compare count colnum title("program take-up by Educate! treatment assignment, unadjusted") vappend replace
	
* TEACHER CURRICULAR IMPLEMENTATION
/*Goal: produce columns 3-5 of Table P2
	Uses teacher survey data.
	Regress curricular implementation measure on treatment status, controlling
		for baseline outcome and randomization strata (baseline outcome not applicable.
	Impute values for missing baseline outcome. Include dummy for missing in 
		regression.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).*/

qui use "$cleandata/teacher_merge_jde.dta", clear

local Y "skillslab_sched_el lessonplan_index_el eknowledge_index_el"
local Y0 "skillslab_definition_bl lessonplan_index_bl eknowledge_index_bl"

* prep missing values for teachers without baseline outcome
foreach y0 in `Y0' {
	qui su `y0' if treatment==0 & insample_bl==1
	qui replace `y0'=r(mean) if insample_bl!=1
	qui gen `y0'm=(insample_bl!=1)
	lab var `y0'm "missing value for `y0'"
}

* regressions: Table P2, columns 3-5
/*Column 3*/
qui areg skillslab_sched_el treatment skillslab_definition_bl skillslab_definition_blm, a(strata) robust
scalar define H1_c`i'b=_b[treatment]
scalar define H1_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

/*Column 4*/
qui areg lessonplan_index_el treatment lessonplan_index_bl lessonplan_index_blm, a(strata) robust
scalar define H1_c`i'b=_b[treatment]
scalar define H1_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

/*Column 5*/
qui areg eknowledge_index_el treatment eknowledge_index_bl eknowledge_index_blm, a(strata) robust
scalar define H1_c`i'b=_b[treatment]
scalar define H1_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

* HEAD TEACHER PERCEPTIONS
/*Goal: produce columns 6-7 of Table P2
	Uses teacher survey data.
	Regress curricular implementation measure on treatment status, controlling
		for baseline outcome and randomization strata (baseline outcome not applicable.
	Impute values for missing baseline outcome. Include dummy for missing in 
		regression.*/

qui use "$cleandata/headteacher_merge_jde.dta", clear

local Y "ht_pedagogy_active_index_el goalteach_shld_skill_el"
local Y0 "ht_pedagogy_active_index_bl skillbasedsupport_bl"

* prep missing values for head teachers without baseline outcome
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* regressions: Table P2, columns 6-7

/*Column 6*/
qui areg ht_pedagogy_active_index_el treatment ht_pedagogy_active_index_bli ht_pedagogy_active_index_blm, a(strata) robust
scalar define H1_c`i'b=_b[treatment]
scalar define H1_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

/*Column 7*/
qui areg goalteach_shld_skill_el treatment skillbasedsupport_bli skillbasedsupport_blm, a(strata) robust
scalar define H1_c`i'b=_b[treatment]
scalar define H1_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H1_c`i'p=2*ttail(e(df_r),abs(t))

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local n=`i'
qui set obs `n'
qui gen hypothesis=1
qui gen column=_n
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=1/`n' {
	qui replace b=H1_c`i'b in `i'
	qui replace se=H1_c`i'se in `i'
	qui replace pval=H1_c`i'p in `i' 
}

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/***********************************************************************************
	Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. 
***********************************************************************************/

while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis column b se pval bky06_qval

* prep results for inclusion in final regression output
forval i=1/`n' {
	scalar define H1_c`i'q=bky06_qval in `i'
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES	
* PROGRAM TAKE-UP: teacher training & exchange visit attendance
/*Goal: produce columns 1-2 of Table P2
	Uses Educate! administrative data on program take-up, collapsed to school level.
	Regress training/exchange visit attendance on treatment status, controlling
		for randomization strata (baseline outcome not applicable.
	Note that according to Section 3.5 of JDE Registered Report Stage 1, we will
		do 2SLS if these take-up measures fall below 85%.
	Note also that because take-up is defined at school level, no need to calculate
		Lee bounds for attrition.
	Include sharpened q-values calculated above.*/

qui use "$cleandata/rwanda_school_trainattend_jde.dta", clear


* regressions 
qui areg training_any_pct treatment, a(strata) robust
qui su training_any_pct if treatment == 0
scalar mu_c = r(mean)
scalar qv = H1_c1q  
outreg2 using "$results/rwanda_jde_table3.xls", excel addstat("control mean", mu_c, "q-value", qv) replace  

qui areg exchange_pct treatment, a(strata) robust
qui su exchange_pct if treatment == 0
scalar mu_c = r(mean)
scalar qv = H1_c1q  
outreg2 using "$results/rwanda_jde_table3.xls", excel addstat("control mean", mu_c, "q-value", qv) append






* TEACHER CURRICULAR IMPLEMENTATION
/*Goal: produce columns 3-5 of Table P2
	Uses teacher survey data.
	Regress curricular implementation measure on treatment status, controlling
		for baseline outcome and randomization strata (baseline outcome not applicable.
	Impute values for missing baseline outcome. Include dummy for missing in 
		regression.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values calculated above.*/
qui use "$cleandata/teacher_merge_jde.dta", clear

local Y "skillslab_sched_el lessonplan_index_el eknowledge_index_el"
local Y0 "skillslab_definition_bl lessonplan_index_bl eknowledge_index_bl"

* prep missing values for teachers without baseline outcome
foreach y0 in `Y0' {
	qui su `y0' if treatment==0 & insample_bl==1
	qui replace `y0'=r(mean) if insample_bl!=1
	qui gen `y0'm=(insample_bl!=1)
	lab var `y0'm "missing value for `y0'"
}

* define program for obtaining Lee bounds
program getleebounds
	qui predict e if e(sample), residuals
	qui leebounds e treatment, select(insample_el)
	scalar define lb=_b[lower]
	scalar define ub=_b[upper]
	drop e
end

* regressions: Table P2, columns 3-5
local stat ""control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub"

/*Column 3*/
qui areg skillslab_sched_el skillslab_definition_bl skillslab_definition_blm, a(strata) robust
getleebounds

local i = 3
qui areg skillslab_sched_el treatment skillslab_definition_bl skillslab_definition_blm, a(strata) robust
qui su skillslab_sched_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su skillslab_definition_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H1_c`i'q
outreg2 using "$results/rwanda_jde_table3.xls", se excel nolabel nocons addstat("control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub) append
local i=`i'+1

/*Column 4*/
qui areg lessonplan_index_el lessonplan_index_bl lessonplan_index_blm, a(strata) robust
getleebounds

qui areg lessonplan_index_el treatment lessonplan_index_bl lessonplan_index_blm, a(strata) robust
qui su lessonplan_index_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su lessonplan_index_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H1_c`i'q
outreg2 treatment using "$results/rwanda_jde_table3.xls", se excel nolabel nocons addstat("control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub) append
local i=`i'+1

/*Column 5*/
qui areg eknowledge_index_el eknowledge_index_bl eknowledge_index_blm, a(strata) robust
getleebounds

qui areg eknowledge_index_el treatment eknowledge_index_bl eknowledge_index_blm, a(strata) robust
qui su eknowledge_index_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su eknowledge_index_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H1_c`i'q
outreg2 treatment using "$results/rwanda_jde_table3.xls", se excel nolabel nocons ///
	addstat("control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub) append
local i=`i'+1

* HEAD TEACHER PERCEPTIONS
/*Goal: produce columns 6-7 of Table P2
	Uses teacher survey data.
	Regress curricular implementation measure on treatment status, controlling
		for baseline outcome and randomization strata (baseline outcome not applicable.
	Impute values for missing baseline outcome. Include dummy for missing in 
		regression.
	Include sharpened q-values calculated above.*/
qui use "$cleandata/headteacher_merge_jde.dta", clear

local Y "ht_pedagogy_active_index_el goalteach_shld_skill_el"
local Y0 "ht_pedagogy_active_index_bl skillbasedsupport_bl"

* prep missing values for head teachers without baseline outcome
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* regressions: Table P2, columns 6-7

/*Column 6*/
qui areg ht_pedagogy_active_index_el ht_pedagogy_active_index_bli ht_pedagogy_active_index_blm, a(strata) robust
getleebounds

qui areg ht_pedagogy_active_index_el treatment ht_pedagogy_active_index_bli ht_pedagogy_active_index_blm, a(strata) robust
qui su ht_pedagogy_active_index_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su ht_pedagogy_active_index_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H1_c`i'q
outreg2 treatment using "$results/rwanda_jde_table3.xls", se excel nolabel nocons ///
	addstat("control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub) append
local i=`i'+1

/*Column 7*/
qui areg goalteach_shld_skill_el skillbasedsupport_bli skillbasedsupport_blm, a(strata) robust
getleebounds

qui areg goalteach_shld_skill_el treatment skillbasedsupport_bli skillbasedsupport_blm, a(strata) robust
qui su goalteach_shld_skill_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su skillbasedsupport_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H1_c`i'q
outreg2 treatment using "$results/rwanda_jde_table3.xls", se excel nolabel nocons ///
	addstat("control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub) append

local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
